Churn prediction on two synthetic datasets¶

In this project I'll explore datasets on staff retention

By using different machine learning models I'll predict if an employee is or not about to leave the organization

tools used:

  1. Importing Datasets as csv with pandas
  2. Data cleaning with pandas
  3. Feature selection using Random Forest
  4. Resampling techniques for class imbalance
  5. encoding using Leave one out encoder
  6. modelling using 5 different models: KNN, Random Forest, Ada boosting, Random Forest, and Naive Bayes
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.metrics import confusion_matrix as confucio
from sklearn.model_selection import train_test_split
import numpy as np
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
import category_encoders as ce
from collections import Counter


def evaluate_model(dt_classifier):
    print("Train Accuracy :", accuracy_score(y_train, dt_classifier.predict(X_train)))
    print("Train Confusion Matrix:")
    print(confucio(y_train, dt_classifier.predict(X_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confucio(y_test, dt_classifier.predict(X_test)))

Dataset 1¶

In [ ]:
#DATASET 1

df = pd.read_csv('C:/Users/santi/OneDrive/Desktop/HR_comma_sep.csv')
df.columns

#distinct values for some categorical data
unique_list = []
for item in df['salary']: 
    if item not in unique_list: 
        unique_list.append(item) 

print(unique_list)
len(unique_list)

#Some data cleaning
#renaming columns names
df = df.rename(str.lower, axis='columns')
df = df.rename(columns={'sales': 'department'})
df.columns
      
#checking null values and shape
df.shape
df = df.dropna(axis=0)
df.isna().sum()
['low', 'medium', 'high']
Out[ ]:
satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64

Plotting categories and distributions¶

In [ ]:
#Genreate PLOTS 

df['left']=df['left'].replace(0,'No')
df['left']=df['left'].replace(1,'Yes')

g = sns.countplot(data=df, x="left")
g.set_xlabel('left',fontsize=15)
g.set_ylabel('Count',fontsize=15)
g.tick_params(labelsize=11)
g.set_title('Dataset 1 Attrition count plot')
plt.show()

df['left']=df['left'].replace('No',0)
df['left']=df['left'].replace('Yes',1) 
df['department']=df['department'].replace('sales',0)
df['department']=df['department'].replace('accounting',1)
df['department']=df['department'].replace('hr',2)
df['department']=df['department'].replace('technical',3)
df['department']=df['department'].replace('support',4)
df['department']=df['department'].replace('management',5)
df['department']=df['department'].replace('IT',6)
df['department']=df['department'].replace('product_mng',7)
df['department']=df['department'].replace('marketing',8)
df['department']=df['department'].replace('RandD',9)
df['salary']=df['salary'].replace('low',1) 
df['salary']=df['salary'].replace('medium',2) 
df['salary']=df['salary'].replace('high',3) 

#ditribution plot

df1 = df.iloc[: , :6]
df2 = df.iloc[: , 6:12]
sns.color_palette("tab10")

n_rows=2
n_cols=3
palette = ('dodgerblue' ,'red','blue' ,'orange','black' ,'purple')
palette = iter(palette)
# Create the subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,figsize=(11, 8))
for i, column in enumerate(df1.columns):
     c = next(palette)
     sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)

n_rows=2
n_cols=3
palette = ('green' ,'black','gold' ,'magenta','cyan' ,'deeppink')
palette = iter(palette)
# Create the subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,figsize=(11, 8))
for i, column in enumerate(df2.columns):
     c = next(palette)
     sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
     
df3d = df.sample(n=5000)

#3D plot
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
scatter = ax.scatter(df3d['satisfaction_level'],
           df3d['salary'], 
           df3d['average_montly_hours'], 
           s = df3d['satisfaction_level']*(200), 
           c = df3d['left'],
    cmap='viridis', alpha=0.3,linewidth=1.5,edgecolors='black')
ax.set_xlabel('satisfaction level', fontsize=20)
ax.set_ylabel('salary', fontsize=20)
ax.set_zlabel('montly hours', fontsize=20)
handles, labels = scatter.legend_elements(prop="colors", alpha=1)
legend = ax.legend(handles, labels, loc='upper left',fontsize = 17,shadow = True)
legend.set_title('left',prop={'size':'large'})
ax.set_title('* points Sized by satisfaction level ')

plt.show()

# this next plot is taking a while to load
sns.pairplot(df3d,hue='left')
plt.show() 

df['department']=df['department'].replace(0,'sales')
df['department']=df['department'].replace(1,'accounting')
df['department']=df['department'].replace(2,'hr')
df['department']=df['department'].replace(3,'technical')
df['department']=df['department'].replace(4,'support')
df['department']=df['department'].replace(5,'management')
df['department']=df['department'].replace(6,'IT')
df['department']=df['department'].replace(7,'product_mng')
df['department']=df['department'].replace(8,'marketing')
df['department']=df['department'].replace(9,'RandD')
df['salary']=df['salary'].replace(1,'low') 
df['salary']=df['salary'].replace(2,'medium') 
df['salary']=df['salary'].replace(3,'high') 

df.isna().sum
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:53: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:53: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:53: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:53: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
Out[ ]:
<bound method NDFrame._add_numeric_operations.<locals>.sum of        satisfaction_level  last_evaluation  number_project  \
0                   False            False           False   
1                   False            False           False   
2                   False            False           False   
3                   False            False           False   
4                   False            False           False   
...                   ...              ...             ...   
14994               False            False           False   
14995               False            False           False   
14996               False            False           False   
14997               False            False           False   
14998               False            False           False   

       average_montly_hours  time_spend_company  work_accident   left  \
0                     False               False          False  False   
1                     False               False          False  False   
2                     False               False          False  False   
3                     False               False          False  False   
4                     False               False          False  False   
...                     ...                 ...            ...    ...   
14994                 False               False          False  False   
14995                 False               False          False  False   
14996                 False               False          False  False   
14997                 False               False          False  False   
14998                 False               False          False  False   

       promotion_last_5years  department  salary  
0                      False       False   False  
1                      False       False   False  
2                      False       False   False  
3                      False       False   False  
4                      False       False   False  
...                      ...         ...     ...  
14994                  False       False   False  
14995                  False       False   False  
14996                  False       False   False  
14997                  False       False   False  
14998                  False       False   False  

[14999 rows x 10 columns]>

Leave one out encoder¶

In [ ]:
# x and y
y = df['left']
x = df.drop(columns=['left'])
print(x.columns)

import category_encoders as ce
#encoder
ce = ce.LeaveOneOutEncoder(cols=['work_accident','promotion_last_5years', 'department', 'salary'],return_df=True)
x = ce.fit_transform(x,y)
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')

Feature selection, train test split , multicolinearity and Oversampling¶

In [ ]:
#Feature selection and splitting

# decision tree for feature importance on a classification problem
model = RandomForestClassifier()
#fit
model.fit(x, y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()

x = x[['promotion_last_5years','work_accident','satisfaction_level']]


X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.30)

#ADASYN Oversampling
print("Before sampling: ", Counter(y_train))
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=42)
X_train, y_train = ada.fit_resample(x, y)
print("After ADASYN: ", Counter(y_train))

cor = X_train.corr()
plt.figure(figsize=(12,10))
sns.heatmap(cor, cmap=plt.cm.CMRmap_r,annot=True)
plt.show()  
Feature: 0, Score: 0.06910
Feature: 1, Score: 0.01024
Feature: 2, Score: 0.03327
Feature: 3, Score: 0.01428
Feature: 4, Score: 0.02712
Feature: 5, Score: 0.25458
Feature: 6, Score: 0.56574
Feature: 7, Score: 0.00057
Feature: 8, Score: 0.02510
Before sampling:  Counter({0: 7958, 1: 2541})
After ADASYN:  Counter({1: 11429, 0: 11428})

Modelling¶

ADA Boost

In [ ]:
#Modelling

#ADA
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = model.predict(X_test)


#train model with cv of 10
cv_scores = cross_val_score(abc, X_train, y_train, cv=10)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))



ADA = {'model': 'ADA' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

CM_ADA = confucio(y_test, y_pred)

print('ADA')
evaluate_model(abc)
[1.         1.         1.         0.99912511 1.         1.
 1.         1.         1.         1.        ]
cv_scores mean:0.999912510936133
ADA
Train Accuracy : 1.0
Train Confusion Matrix:
[[11428     0]
 [    0 11429]]
--------------------------------------------------
Test Accuracy : 1.0
Test Confusion Matrix:
[[3470    0]
 [   0 1030]]

KNN

In [ ]:
#knn

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))


#Checking for k value
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over K values
for i, k in enumerate(neighbors):
	knn = KNeighborsClassifier(n_neighbors=k)
	knn.fit(X_train, y_train)
	
	# Compute training and test data accuracy
	train_accuracy[i] = knn.score(X_train, y_train)
	test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()


#cv
#create a new KNN model
knn_cv = KNeighborsClassifier(n_neighbors=1)
#train model with cv of 5 
cv_scores = cross_val_score(knn_cv, X_train, y_train, cv=10)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))


KNN = {'model': 'KNN' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

CM_KNN = confucio(y_test, y_pred)

print('KNN')
evaluate_model(knn)
0.9997777777777778
[0.99956255 0.99781277 0.99868766 0.99956255 0.92650919 0.96456693
 0.97156605 1.         0.99956236 1.        ]
cv_scores mean:0.9857830079773946
KNN
Train Accuracy : 0.991206195038719
Train Confusion Matrix:
[[11405    23]
 [  178 11251]]
--------------------------------------------------
Test Accuracy : 0.9982222222222222
Test Confusion Matrix:
[[3463    7]
 [   1 1029]]

Naive Bayes

In [ ]:
#Naive Bayes
nb_model = Pipeline([
        ('classification', GaussianNB())
    ])
nb_model.get_params().keys()
nb_clf = GridSearchCV(estimator=nb_model, param_grid={}, scoring='recall', cv=5)
nb_clf.fit(X_train,y_train )


y_pred = nb_clf.predict(X_test)
model_nb_cm = confucio(y_test, y_pred)
model_nb_result = []
model_nb_result.append(precision_score(y_test, y_pred, average='macro'))
model_nb_result.append(recall_score(y_test, y_pred, average='macro'))
model_nb_result.append(accuracy_score(y_test, y_pred))
model_nb_result.append(f1_score(y_test, y_pred , average = 'binary'))


NB = {'model': 'NB' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

CM_NB = confucio(y_test, y_pred)
print('NB')
evaluate_model(nb_clf)
NB
Train Accuracy : 0.6638666491665572
Train Confusion Matrix:
[[8694 2734]
 [4949 6480]]
--------------------------------------------------
Test Accuracy : 0.6928888888888889
Test Confusion Matrix:
[[2592  878]
 [ 504  526]]

Decision Tree

In [ ]:
#Decision tree
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=7)
clf = clf.fit(X_train, y_train)

tree.plot_tree(clf)

y_pred = clf.predict(X_test)

TREE = {'model': 'TREE' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

CM_TREE = confucio(y_test, y_pred)
print('tree')
evaluate_model(clf)
tree
Train Accuracy : 1.0
Train Confusion Matrix:
[[11428     0]
 [    0 11429]]
--------------------------------------------------
Test Accuracy : 1.0
Test Confusion Matrix:
[[3470    0]
 [   0 1030]]

Random Forest

In [ ]:
#Random forest
dt = RandomForestClassifier(random_state=42)

params = {
    'max_depth': [1,2,3,4,5,6,7,8,9,10,15, 20],
    'min_samples_leaf': [1,2,3,4,5,6,7,8,9, 10,15, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=5, n_jobs=-1, verbose=1, scoring = "accuracy")

grid_search.fit(X_train, y_train)
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()
score_df.nlargest(5,"mean_test_score")
dt_best = grid_search.best_estimator_



FOREST = {'model': 'FOREST' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

print('Forest')
evaluate_model(grid_search)
Fitting 5 folds for each of 336 candidates, totalling 1680 fits
Forest
Train Accuracy : 1.0
Train Confusion Matrix:
[[11428     0]
 [    0 11429]]
--------------------------------------------------
Test Accuracy : 1.0
Test Confusion Matrix:
[[3470    0]
 [   0 1030]]

Plotting results from models

In [ ]:
#Result dataset for dataset 1

TREE = pd.DataFrame(TREE, index=[0])
FOREST =pd.DataFrame(FOREST,index=[0])
NB = pd.DataFrame(NB,index=[0])
KNN = pd.DataFrame(KNN,index=[0])
ADA = pd.DataFrame(ADA,index=[0])

results = pd.concat([TREE, FOREST], axis=0)
results = pd.concat([results, NB], axis=0)
results = pd.concat([results, KNN], axis=0)
results = pd.concat([results, ADA], axis=0)

results['dataset'] = 'dataset 1'
print(results)
    model  precision    recall  accuracy       f1   ROC AUC    dataset
0    TREE   1.000000  1.000000  1.000000  1.00000  1.000000  dataset 1
0  FOREST   1.000000  1.000000  1.000000  1.00000  1.000000  dataset 1
0      NB   0.605927  0.628827  0.692889  0.43221  0.628827  dataset 1
0     KNN   1.000000  1.000000  1.000000  1.00000  1.000000  dataset 1
0     ADA   1.000000  1.000000  1.000000  1.00000  1.000000  dataset 1

Dataset 2¶

In [ ]:
#Dataset 2

data = pd.read_csv('C:/Users/santi/OneDrive/Desktop/IBM.csv')
data.columns

#checking values
for x in data.columns:
 print(data[x].value_counts())
 
#checking null values
for x in data.columns:
    a = data[x].isna().sum()
    if a>0:
        print(x  + '   ' + a)
        
data = data.drop(columns=['EmployeeID', 'recorddate_key', 'birthdate_key', 'orighiredate_key',
       'terminationdate_key',
       'gender_full', 'termreason_desc', 'termtype_desc', 'STATUS_YEAR'])
1318    10
5169    10
5155    10
5157    10
5158    10
        ..
2568     1
2575     1
2578     1
2579     1
8264     1
Name: EmployeeID, Length: 6284, dtype: int64
12/31/2013 0:00     5215
12/31/2012 0:00     5101
12/31/2011 0:00     4972
12/31/2014 0:00     4962
12/31/2010 0:00     4840
                    ... 
09/01/2011 00:00       3
04/01/2015 00:00       3
08/01/2012 00:00       2
06/01/2014 00:00       2
07/01/2014 00:00       2
Name: recorddate_key, Length: 130, dtype: int64
3/23/1973     40
08/04/1954    40
4/27/1956     40
03/06/1956    30
7/13/1972     30
              ..
9/14/1941      1
09/12/1941     1
09/01/1941     1
8/29/1941      1
6/13/1994      1
Name: birthdate_key, Length: 5342, dtype: int64
9/25/2006     50
08/09/1992    50
2/26/2006     50
10/16/2005    50
12/04/2004    50
              ..
06/02/1993     1
07/09/1997     1
7/24/1997      1
7/25/1997      1
8/27/2013      1
Name: orighiredate_key, Length: 4415, dtype: int64
01/01/1900    42450
12/30/2014     1079
12/30/2015      674
12/30/2010       25
11/11/2012       21
              ...  
11/13/2006        1
10/31/2006        1
10/30/2006        1
10/03/2006        1
9/14/2013         1
Name: terminationdate_key, Length: 1055, dtype: int64
27    1235
29    1227
28    1225
50    1218
30    1212
26    1210
51    1207
25    1197
49    1196
35    1189
34    1188
53    1188
52    1188
48    1180
36    1176
47    1173
55    1168
33    1164
46    1161
54    1159
44    1157
38    1156
56    1154
32    1153
42    1152
43    1150
37    1149
31    1146
39    1142
45    1141
41    1135
58    1130
57    1130
40    1130
59    1128
24    1111
60    1109
23     960
22     815
61     757
62     712
21     703
63     667
64     646
65     593
20     408
19     158
Name: age, dtype: int64
13    2885
12    2567
8     2559
11    2482
10    2432
9     2381
7     2341
6     2294
3     2270
4     2262
5     2258
2     2257
1     2222
14    2203
15    2192
16    2160
17    2066
0     1962
18    1829
19    1656
20    1322
21    1047
22     830
23     608
24     433
25     121
26      14
Name: length_of_service, dtype: int64
Vancouver           11211
Victoria             4885
Nanaimo              3876
New Westminster      3211
Kelowna              2513
Burnaby              2067
Kamloops             2061
Prince George        2048
Cranbrook            1785
Surrey               1560
Richmond             1401
Terrace              1228
Chilliwack           1167
Trail                 925
Langley               901
Vernon                898
Squamish              806
Quesnel               703
Abbotsford            681
North Vancouver       648
Fort St John          621
Williams Lake         617
West Vancouver        613
Port Coquitlam        545
Aldergrove            520
Fort Nelson           322
Nelson                317
New Westminister      254
Grand Forks           236
White Rock            231
Haney                 182
Princeton             136
Dawson Creek          129
Bella Bella           126
Ocean Falls            65
Pitt Meadows           57
Cortes Island          43
Valemount              37
Dease Lake             18
Blue River              9
Name: city_name, dtype: int64
Meats                     10269
Dairy                      8599
Produce                    8515
Bakery                     8381
Customer Service           7122
Processed Foods            5911
Store Management            271
Executive                   100
Recruitment                  72
HR Technology                64
Accounting                   59
Employee Records             44
Accounts Receiveable         39
Labor Relations              34
Accounts Payable             34
Training                     30
Compensation                 24
Audit                        24
Investment                   24
Information Technology       20
Legal                        17
Name: department_name, dtype: int64
Meat Cutter                        9984
Dairy Person                       8590
Produce Clerk                      8237
Baker                              8096
Cashier                            6816
Shelf Stocker                      5622
Customer Service Manager            306
Processed Foods Manager             289
Bakery Manager                      285
Meats Manager                       285
Produce Manager                     278
Store Manager                       271
Recruiter                            62
HRIS Analyst                         55
Accounting Clerk                     50
Benefits Admin                       35
Labor Relations Analyst              30
Accounts Receiveable Clerk           30
Trainer                              26
Accounts Payable Clerk               25
Auditor                              20
Systems Analyst                      20
Investment Analyst                   20
Compensation Analyst                 20
Corporate Lawyer                     17
CEO                                  10
Exec Assistant, VP Stores            10
Legal Counsel                        10
VP Stores                            10
VP Human Resources                   10
VP Finance                           10
Director, Recruitment                10
Exec Assistant, Finance              10
Exec Assistant, Human Resources      10
CHief Information Officer            10
Exec Assistant, Legal Counsel        10
Director, Accounts Payable            9
Director, Accounts Receivable         9
Director, Employee Records            9
Director, HR Technology               9
Dairy Manager                         9
Director, Accounting                  9
Director, Investments                 4
Director, Labor Relations             4
Director, Compensation                4
Director, Audit                       4
Director, Training                    4
Name: job_title, dtype: int64
46    4422
18    3876
42    3827
21    3211
43    2896
16    2513
5     2067
15    2061
26    2048
8     1785
41    1765
31    1560
44    1520
29    1401
32    1228
6     1167
35    1143
33     925
17     901
36     898
30     806
28     703
1      681
22     648
12     621
40     617
38     613
25     545
2      520
37     463
11     322
19     317
20     254
13     236
39     231
14     182
27     136
9      129
3      126
23      65
45      60
24      57
7       43
34      37
10      18
4        9
Name: store_name, dtype: int64
F    25898
M    23755
Name: gender_short, dtype: int64
Female    25898
Male      23755
Name: gender_full, dtype: int64
Not Applicable    48168
Retirement          885
Resignaton          385
Layoff              215
Name: termreason_desc, dtype: int64
Not Applicable    48168
Voluntary          1270
Involuntary         215
Name: termtype_desc, dtype: int64
2013    5320
2012    5231
2014    5215
2011    5082
2010    4963
2015    4961
2009    4852
2008    4767
2007    4683
2006    4579
Name: STATUS_YEAR, dtype: int64
ACTIVE        48168
TERMINATED     1485
Name: STATUS, dtype: int64
STORES        49068
HEADOFFICE      585
Name: BUSINESS_UNIT, dtype: int64

Plotting and exploratory analysis

In [ ]:
#charts and plots

g = sns.countplot(data=data, x="STATUS")
g.tick_params(labelsize=11)
g.set_title('Dataset 2 Attrition count plot')
plt.show() 

dd = data
sns.countplot(data=dd,  x='gender_short', hue='STATUS')
plt.show()
sns.countplot(data=dd,  x='BUSINESS_UNIT', hue='STATUS')
plt.show()
sns.countplot(data=dd,  x='department_name', hue='STATUS')
plt.xticks(rotation = 90)
plt.show()
sns.countplot(data=dd,  x='age', hue= 'STATUS')
plt.xticks(rotation = 90)
plt.show()


bins = 10
ax1 = sns.histplot(data=dd, x="age", hue='STATUS', multiple="dodge",bins=bins,alpha=.8,kde=True)
sns.move_legend(ax1, "upper right")
ax1.set_title('Age count plot and status')
plt.show()



male = dd[dd['gender_short']=='M']
fem = dd[dd['gender_short']=='F']
ter_M =  male[male['STATUS']=='TERMINATED']
ter_F = fem[fem['STATUS']=='TERMINATED']


import matplotlib.ticker as mtick

plot = {'Gender': ('Female','Male') , 'Attrition rate': (len(ter_F) / len(fem) * 100, len(ter_M) / len(male) * 100)}
plot = pd.DataFrame(plot)
fig = sns.barplot(data = plot, x='Gender',y='Attrition rate')
fig.yaxis.set_major_formatter(mtick.PercentFormatter(100))
plt.title("Attrition rate by gender")
plt.show()


#encoding for charts
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for x in dd.columns:
 label = le.fit_transform(dd[x])
 dd.drop(x, axis=1, inplace=True)
 dd[x] = label


sns.pairplot(data=dd, hue='STATUS')


#ditribution plot
df1 = dd.iloc[: , :6]
df2 = dd.iloc[: , 6:12]

sns.color_palette("tab10")
n_rows=2
n_cols=3
palette = ('dodgerblue' ,'red','blue' ,'orange','black' ,'purple')
palette = iter(palette)
# Create the subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,figsize=(11, 8))
for i, column in enumerate(df1.columns):
     c = next(palette)
     sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)

n_rows=2
n_cols=3
palette = ('green' ,'black','deeppink' ,'magenta','cyan' ,'deeppink')
palette = iter(palette)
# Create the subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,figsize=(11, 8))
for i, column in enumerate(df2.columns):
     c = next(palette)
     sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
plt.show()


data['STATUS'] = data['STATUS'].replace('ACTIVE',0)
data['STATUS'] = data['STATUS'].replace('TERMINATED',1)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:80: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:80: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:80: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)

Feature importance and encoding¶

In [ ]:
#X and Y
y = data['STATUS']
x = data.drop(columns=['STATUS'])

#Leave one out encoder
import category_encoders as ce
ce = ce.LeaveOneOutEncoder(cols=['city_name','department_name', 'store_name', 
                                 'gender_short','BUSINESS_UNIT','job_title'],return_df=True)

x = ce.fit_transform(x,y)

# decision tree for feature importance on a classification problem
model = RandomForestClassifier()
# fit the model
model.fit(x,y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()


x = x[['age','gender_short','BUSINESS_UNIT']]
Feature: 0, Score: 0.17533
Feature: 1, Score: 0.00395
Feature: 2, Score: 0.02174
Feature: 3, Score: 0.02162
Feature: 4, Score: 0.01399
Feature: 5, Score: 0.02186
Feature: 6, Score: 0.16474
Feature: 7, Score: 0.57678

Train test split , multicolinearity and resampling

In [ ]:
#split
X_train, X_test, y_train, y_test = train_test_split(x, y,test_size=0.30)

#adasyn resampling
print("Before resampling: ", Counter(y_train))
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=42)
X_train, y_train= ada.fit_resample(X_train,y_train)
print("After ADASYN: ", Counter(y_train))

#correlation matrix
cor = X_train.corr()
plt.figure(figsize=(12,10))
sns.heatmap(cor, cmap=plt.cm.CMRmap_r,annot=True)
plt.show()  
Before resampling:  Counter({0: 33715, 1: 1042})
After ADASYN:  Counter({1: 33719, 0: 33715})

Modelling¶

Ada Boosting and KNN

In [ ]:
#ADA
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = model.predict(X_test)


#cv
#create a new KNN model

#train model with cv of 5 
cv_scores = cross_val_score(abc, X_train, y_train, cv=5)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
ADA2 = {'model': 'ADA' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

print('ADA')
CM_ADA2 = confucio(y_test, y_pred)

evaluate_model(abc)


#knn

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))


#Checking for k value
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over K values
for i, k in enumerate(neighbors):
	knn = KNeighborsClassifier(n_neighbors=k)
	knn.fit(X_train, y_train)
	
	# Compute training and test data accuracy
	train_accuracy[i] = knn.score(X_train, y_train)
	test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()


#cv
#create a new KNN model
knn_cv = KNeighborsClassifier(n_neighbors=5)
knn_cv.fit(X_train, y_train)
#train model with cv of 5 
cv_scores = cross_val_score(knn_cv, X_train, y_train, cv=5)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

print('knn')
evaluate_model(knn_cv)


KNN2 = {'model': 'KNN' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

CM_KNN2 = confucio(y_test, y_pred)
[0.9967376 1.        1.        1.        1.       ]
cv_scores mean:0.9993475198339141
ADA
Train Accuracy : 1.0
Train Confusion Matrix:
[[33715     0]
 [    0 33719]]
--------------------------------------------------
Test Accuracy : 1.0
Test Confusion Matrix:
[[14453     0]
 [    0   443]]
0.9990601503759399
[0.923037   0.99822051 0.98635723 0.99733076 0.98413169]
cv_scores mean:0.9778154390404408
knn
Train Accuracy : 0.9995699498769167
Train Confusion Matrix:
[[33714     1]
 [   28 33691]]
--------------------------------------------------
Test Accuracy : 0.9990601503759399
Test Confusion Matrix:
[[14451     2]
 [   12   431]]

Naive Bayes and Decision Tree

In [ ]:
#NB
nb_model = Pipeline([
        ('classification', GaussianNB())
    ])
nb_model.get_params().keys()
nb_clf = GridSearchCV(estimator=nb_model, param_grid={}, scoring='recall', cv=5)
nb_clf.fit(X_train, y_train)


y_pred = nb_clf.predict(X_test)
model_nb_cm = confucio(y_test, y_pred)
NB2 = {'model': 'NB' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

CM_NB2 = confucio(y_test, y_pred)
print('NB')
evaluate_model(nb_clf)

#Decision tree
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=3)
clf = clf.fit(X_train, y_train)

tree.plot_tree(clf)

y_pred= clf.predict(X_test)
model_tree_cm = confucio(y_test, y_pred)


#cv

#train model with cv of 5 
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))


TREE2 = {'model': 'TREE' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }
print('TREE')
CM_TREE2 = confucio(y_test, y_pred)

evaluate_model(clf)
NB
Train Accuracy : 0.5047898686122727
Train Confusion Matrix:
[[  365 33350]
 [   44 33675]]
--------------------------------------------------
Test Accuracy : 0.038198174006444686
Test Confusion Matrix:
[[  151 14302]
 [   25   418]]
[0.9967376 1.        1.        1.        1.       ]
cv_scores mean:0.9993475198339141
TREE
Train Accuracy : 1.0
Train Confusion Matrix:
[[33715     0]
 [    0 33719]]
--------------------------------------------------
Test Accuracy : 1.0
Test Confusion Matrix:
[[14453     0]
 [    0   443]]

Random Forest

In [ ]:
#forest
dt = RandomForestClassifier(random_state=42)

params = {
    'max_depth': [1,2,3,4,5,6,7,8,9,10,15, 20],
    'min_samples_leaf': [1,2,3,4,5,6,7,8,9, 10,15, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=5, n_jobs=-1, verbose=1, scoring = "accuracy")

grid_search.fit(X_train, y_train)
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()
score_df.nlargest(5,"mean_test_score")
dt_best = grid_search.best_estimator_


FOREST2 = {'model': 'FOREST' ,'precision':precision_score(y_test, y_pred, average='macro'),
 'recall':recall_score(y_test, y_pred, average='macro'),
 'accuracy': accuracy_score(y_test, y_pred),
 'f1': f1_score(y_test, y_pred , average = 'binary'),
  'ROC AUC': roc_auc_score(y_test, y_pred) }

CM_FOREST2 = confucio(y_test, y_pred)
print('Forest')
evaluate_model(grid_search)
Fitting 5 folds for each of 336 candidates, totalling 1680 fits
Forest
Train Accuracy : 0.9993475101580805
Train Confusion Matrix:
[[33715     0]
 [   44 33675]]
--------------------------------------------------
Test Accuracy : 0.9983216970998926
Test Confusion Matrix:
[[14453     0]
 [   25   418]]

Plotting final results¶

In [ ]:
TREE2 = pd.DataFrame(TREE2, index=[0])
FOREST2 =pd.DataFrame(FOREST2,index=[0])
NB2 = pd.DataFrame(NB2,index=[0])
KNN2 = pd.DataFrame(KNN2,index=[0])
ADA2 = pd.DataFrame(ADA2,index=[0])

results2 = pd.concat([TREE2, FOREST2], axis=0)
results2 = pd.concat([results2, NB2], axis=0)
results2 = pd.concat([results2, KNN2], axis=0)
results2 = pd.concat([results2, ADA2], axis=0)
results2['dataset'] = 'dataset 2'


results_final = pd.concat([results2, results], axis=0)

results_final = results_final.melt(id_vars =['model','dataset'], 
              value_vars =['precision','recall','f1','ROC AUC'], var_name = 'metric')


results_final.rename(columns = {'value':'score'}, inplace = True)
print(results_final)


fig,ax = plt.subplots(1,2,figsize=(20, 10))
plt.suptitle('FInal scores', fontsize=25)
a = sns.barplot(data=results_final[results_final['dataset']=='dataset 1'], x='model',y='score',hue= 'metric',ax=ax[0])
a = sns.barplot(data=results_final[results_final['dataset']=='dataset 2'], x='model',y='score',hue= 'metric',ax=ax[1])
     model    dataset     metric     score
0     TREE  dataset 2  precision  1.000000
1   FOREST  dataset 2  precision  1.000000
2       NB  dataset 2  precision  0.443176
3      KNN  dataset 2  precision  1.000000
4      ADA  dataset 2  precision  1.000000
5     TREE  dataset 1  precision  1.000000
6   FOREST  dataset 1  precision  1.000000
7       NB  dataset 1  precision  0.605927
8      KNN  dataset 1  precision  1.000000
9      ADA  dataset 1  precision  1.000000
10    TREE  dataset 2     recall  1.000000
11  FOREST  dataset 2     recall  1.000000
12      NB  dataset 2     recall  0.477007
13     KNN  dataset 2     recall  1.000000
14     ADA  dataset 2     recall  1.000000
15    TREE  dataset 1     recall  1.000000
16  FOREST  dataset 1     recall  1.000000
17      NB  dataset 1     recall  0.628827
18     KNN  dataset 1     recall  1.000000
19     ADA  dataset 1     recall  1.000000
20    TREE  dataset 2         f1  1.000000
21  FOREST  dataset 2         f1  1.000000
22      NB  dataset 2         f1  0.055134
23     KNN  dataset 2         f1  1.000000
24     ADA  dataset 2         f1  1.000000
25    TREE  dataset 1         f1  1.000000
26  FOREST  dataset 1         f1  1.000000
27      NB  dataset 1         f1  0.432210
28     KNN  dataset 1         f1  1.000000
29     ADA  dataset 1         f1  1.000000
30    TREE  dataset 2    ROC AUC  1.000000
31  FOREST  dataset 2    ROC AUC  1.000000
32      NB  dataset 2    ROC AUC  0.477007
33     KNN  dataset 2    ROC AUC  1.000000
34     ADA  dataset 2    ROC AUC  1.000000
35    TREE  dataset 1    ROC AUC  1.000000
36  FOREST  dataset 1    ROC AUC  1.000000
37      NB  dataset 1    ROC AUC  0.628827
38     KNN  dataset 1    ROC AUC  1.000000
39     ADA  dataset 1    ROC AUC  1.000000

AS shown in plot my models for both datasets were able to predict with hight accuracy